package com.joehongfa.util;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that hands every processed page's document to
 * {@code SaveToFileUtil.SaveAsType} as {@code "html"} and queues further requests for
 * links that match {@code http://www.<word>.com}.
 */
public class GetDocumentUtil implements PageProcessor {
	/**
	 * Regex passed to {@code links().regex(...)} to pick the links to follow.
	 * The dots are escaped so they match a literal '.' only; unescaped, '.' matches any
	 * character and e.g. "http://www.telecom.net" would produce the bogus target
	 * "http://www.telecom".
	 */
	private static final String TARGET_LINK_REGEX = "http://www\\.\\w+\\.com";

	/** Crawl settings: 3 retries, sleep time 1000 and timeout 10000 (milliseconds per the WebMagic Site docs). */
	private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

	/**
	 * Queues the page's links that match {@link #TARGET_LINK_REGEX} as target requests,
	 * then passes the page's document to {@code SaveToFileUtil.SaveAsType} with type "html".
	 *
	 * @param page the page handed in by the spider
	 */
	@Override
	public void process(Page page) {
		page.addTargetRequests(page.getHtml().links().regex(TARGET_LINK_REGEX).all());
		SaveToFileUtil.SaveAsType(page.getHtml().getDocument(), "html");
	}

	/**
	 * @return the {@link Site} configuration held by this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Creates a {@link Spider} backed by a new {@code GetDocumentUtil}, seeds it with the
	 * given URL, configures 5 threads and calls {@code run()} on it.
	 *
	 * @param url the start URL added to the spider
	 */
	public static void spiderThis(String url) {
		Spider.create(new GetDocumentUtil()).addUrl(url).thread(5).run();
	}
}
